/*******************************************************************************

This code file cleans the PLUTO land use database file for 2019.

An important step in this code is that individual condos on 421a receive their 
own BBL codes which do not appear in PLUTO and thus create apparent merge 
failures. Only the entire condo building appears in PLUTO, under its own BBL.
To fix this, I find the corresponding "master BBL" using the PAD.

*******************************************************************************/

*** Manage settings

	run "~/Dropbox (MIT)/Research/NYC421a/code/modules/settings.do"
	
*** Import PLUTO data and merge into 421a dataset
	
	* Import PLUTO
	import delimited "$data/raw/nyc_pluto_19v2_csv/pluto_19v2.csv", encoding(ISO-8859-1) clear

	* Drop unused variables
	drop borough rpaddate dcasdate zoningdate landmkdate basempdate masdate ///
		polidate edesigdate geom spdist3 version owner* areasource plutomapid ///
		appbbl appdate ext zonemap zmcode sanborn taxmap edesignum mappluto_f ///
		dcpedited condono pfirm15_flag healthcenterdistrict tract2010 landmark ///
		commfar facilfar sanitsub sanitboro sanitdistrict easements healtharea ///
		firecomp spdist2 ltdheight splitzone overlay1 overlay2 comarea ///
		officearea retailarea garagearea strgearea factryarea otherarea
		
	* Clean borough variable
	rename borocode borough
	label define borough_label 1 "Manhattan" 2 "Bronx" 3 "Brooklyn" 4 "Queens" 5 "Staten Island"
	label values borough borough_label
	
	* Remove leading/trailing spaces on addresses
	replace address = strtrim(address)
	
	* Fix lot dimensions
	bys borough block address: egen lotfront_ = max(lotfront) 
	replace lotfront = lotfront_ if lotfront == 0
	bys borough block address: egen lotdepth_ = max(lotdepth) 
	replace lotdepth = lotdepth_ if lotdepth == 0
	drop lotfront_ lotdepth_
	
	* Merge into cleaned 2002 PLUTO data
	
		format bbl %18.0f
	
		merge 1:1 bbl using "$data/clean/nyc0219.dta", nogen keep(1 3)

	* Merge into permits data 
	* Note: Condo BBL issue explains _m==2 entries (delete b/c matched on condo BBL appended file)
		
		preserve
		merge m:1 bbl using "$data/clean/permits_cleaned.dta", nogen keep(3)
		tempfile matched
		save `matched', replace
		restore
		
	* Merge into remaining permits data (these BINs cannot be crosswalked to BBLs)
	
		merge m:1 bbl using "$data/clean/permits_cleaned.dta", nogen keep(1)
		merge m:1 borough block address using "$data/raw/unmatched_permits.dta", nogen keep(3 4) update
		merge m:1 bbl using "$data/clean/assessments_cleaned.dta", nogen keep(1 3 4) update
		
		append using `matched'
		
	* Clean up unmatched_permits.dta file
	erase "$data/raw/unmatched_permits.dta"
			
	* Merge in GEA status
	merge m:1 bbl using "$data/clean/gea_bbls.dta", nogen keep(1 3)
						
	* Drop units with nonresidential building class
	drop if landuse == 1 | (landuse >= 5 & !missing(landuse))
	drop if unitsres == 0
		
	* Process zoning districts
		
		* Keep residential zoning code
		gen zonedist = zonedist1 if strpos(zonedist1,"R") > 0 & strpos(zonedist1,"PARK") == 0
		replace zonedist = zonedist2 if strpos(zonedist1,"R") == 0 & strpos(zonedist2,"R") > 0 & strpos(zonedist2,"PARK") == 0
		replace zonedist = zonedist3 if strpos(zonedist1,"R") == 0 & strpos(zonedist2,"R") == 0 & strpos(zonedist3,"R") > 0  & strpos(zonedist3,"PARK") == 0
		replace zonedist = zonedist4 if strpos(zonedist1,"R") == 0 & strpos(zonedist2,"R") == 0 & strpos(zonedist3,"R") == 0 & strpos(zonedist4,"R") > 0 & strpos(zonedist4,"PARK") == 0
		replace zonedist = zonedist1 if missing(zonedist)
		drop zonedist1 zonedist2 zonedist3 zonedist4
		split zonedist, parse("/")
		replace zonedist = zonedist2 if strpos(zonedist1,"R") == 0 & strpos(zonedist2,"R") > 0
		drop zonedist1 zonedist2	
		
		* If residential zoning code not listed, convert using commercial zoning code equivalency
		merge m:1 zonedist using "$data/raw/zoning_dist_conversion.dta", nogen keep(1 3)
		replace zonedist = zonedist_to if !missing(zonedist_to)
		drop zonedist_to
	
	* Process lot types
	replace lottype = . if lottype == 8 | lottype == 9
	
	* Identify condos
	gen condo = 0
	replace condo = 1 if bldgclass == "R0" | bldgclass == "R1" | bldgclass == "R2" | bldgclass == "R3" | bldgclass == "R4" | bldgclass == "R5" | bldgclass == "R6" | bldgclass == "R7" | bldgclass == "R8" | bldgclass == "R9" | bldgclass == "RA" | bldgclass == "RB" | bldgclass == "RC" | bldgclass == "RD" | bldgclass == "RG" | bldgclass == "RH" | bldgclass == "RI" | bldgclass == "RK" | bldgclass == "RM" | bldgclass == "RM" | bldgclass == "RP" | bldgclass == "RR" | bldgclass == "RS" | bldgclass == "RW" | bldgclass == "RX" | bldgclass == "RZ"
		
	* Code GEA variables: original, expansion, by-year
	gen gea_original = 0
	replace gea_original = 1 if missing(gea_expansion) & borough == 1
	replace gea_expansion = 0 if missing(gea_expansion)
	gen gea = gea_original
	replace gea = max(gea_original,gea_expansion) if year(issuance_date)>=2006
	
	* Merge into PAD xwalk
	mmerge bbl using "$data/clean/xwalk_condo_bbl.dta"
	drop _m
		
	* Identify "master BBL" for individual condos
	drop if missing(address)
	duplicates t address block borough yearbuilt, gen(tempflag)
	egen tempgrp = group(address block borough yearbuilt)
	gen double master_bbl = bbl
	bys tempgrp: gegen double master_bbl_ = min(master_bbl)
	format master_bbl %18.0f
	replace master_bbl = master_bbl_ if tempflag > 0 
	drop master_bbl_ tempgrp tempflag
	
	* Make adjustments to master BBL as in PAD
	bys master_bbl: gegen double bbl_new_ = max(bbl_new)
	format bbl_new_ %18.0f
	replace master_bbl = bbl_new_ if master_bbl != bbl_new_ & !missing(bbl_new_)
	drop lot_new bbl_new bbl_new_
	
*** Clean data	

	* Drop only a tiny handful of cases where different BBLs appear to match to the same unit, with different tax treatment
	gen problem = 0
	foreach v of varlist flag10uncap flag15uncap flag25uncap flag20uncap flag10cap flag15cap {
		bys master_bbl: gegen `v'_ = mean(`v')
		replace problem = 1 if `v'_ != 0 & `v'_ != 1 & !missing(`v'_)
	}
	drop if problem == 1
	drop problem flag10uncap_ flag15uncap_ flag25uncap_ flag20uncap_ flag10cap_ flag15cap_

	* Code 421a receipt type by master BBL
	gen receipt_421a = 0
	replace receipt_421a = 1 if flag10uncap == 1
	replace receipt_421a = 2 if flag15uncap == 1
	replace receipt_421a = 3 if flag25uncap == 1
	replace receipt_421a = 4 if flag20uncap == 1
	replace receipt_421a = 5 if flag10cap == 1
	replace receipt_421a = 6 if flag15cap == 1
	replace receipt_421a = 7 if flag35 == 1
		
	bys master_bbl: gegen receipt_421a_ = mean(receipt_421a)
	replace receipt_421a = receipt_421a_
	drop receipt_421a_
	
	label define receipt_421a_label 0 "Not in 421a" 1 "10 Years, Uncapped" 2 "15 Years, Uncapped" 3 "25 Years, Uncapped" 4 "20 Years, Uncapped" 5 "10 Years, Capped" 6 "15 Years, Capped"
	label values receipt_421a receipt_421a_label
	drop flag10uncap flag15uncap flag25uncap flag20uncap flag10cap flag15cap
	
	* Clean CDs and other local govt unit identifiers by master BBL
	foreach v of varlist cd cb2010 schooldist council zipcode ///
			policeprct xcoord ycoord ///
			landuse {
			
		bys master_bbl: gegen `v'_ = mean(`v')
		replace `v' = `v'_ if missing(`v')
		drop `v'_
		
	}
	foreach v of varlist zonedist spdist1 bldgclass proxcode histdist ct2010 lottype bsmtcode condo {
				
		bys master_bbl: egen `v'_ = mode(`v')
		replace `v' = `v'_ if missing(`v')
		drop `v'_
	}

	* Scale up tract numbers for ".00" tracts
	replace ct2010 = round(ct2010*100)
		
	* Recode basement variable
	gen basement = 0 if bsmtcode == 0
	replace basement = 1 if bsmtcode > 0 & bsmtcode < 5
	replace basement = 2 if bsmtcode == 5
	drop bsmtcode
	
	* Collapse to master BBL level
	collapse (sum) unitsres unitstotal unitsres_2002 unitstotal_2002 ///
		(max) receipt_421a firm07_flag gea* condo lotarea bldgarea resarea floorarea_2002 resarea_2002 ///
		(min) yearbuilt issuance_date yearbuilt_2002 ///
		(sum) assessland assesstot assessland_2002 assesstot_2002 ///
		(firstnm) address zonedist cd ct2010 cb2010 schooldist council permittee_id permitteesbusinessname ///
				zipcode policeprct xcoord ycoord landuse proxcode lottype ///
				basement spdist1 bldgclass numbldgs numfloors lotfront lotdepth ///
				bldgfront bldgdepth residfar exempttot yearalter1 yearalter2 histdist builtfar ///
				zonedist_2002 bldgclass_2002, ///
		by(master_bbl borough block)
			
	* Rename variables
	rename firm07_flag floodzone
	replace floodzone = 0 if missing(floodzone)
	
	* Relabel variables
	label values receipt_421a receipt_421a_label
	label values borough borough_label
	
	* Label lot type
	label define lottype_label 0 "Unknown" 1 "Entire Block" 2 "Waterfront" 3 "Corner" 4 "Through" 5 "Inside" 6 "Interior Lot"
	label values lottype lottype_label 
	
	rename master_bbl bbl
	rename bldgclass bldgclass_det
	rename landuse bldgclass_broad
	
	* Patches for missing Census tracts
	bys borough block: egen ct2010_ = mode(ct2010), minmode
	replace ct2010 = ct2010_ if missing(ct2010)
	drop ct2010_
	
	* Recode special purpose district to indicator
	gen spdist = !missing(spdist1)
	drop spdist1
	
	* Encode zoning district variable
	replace zonedist = "Special District" if spdist == 1
	encode zonedist, gen(zonedist_)
	drop zonedist
	rename zonedist_ zonedist
	
	* Generate assessment fraction variable
	gen frac_assess = .
	replace frac_assess = 0.06 if unitstotal < 4
	replace frac_assess = 0.45 if unitstotal >= 4 & !missing(unitstotal)
	
	merge 1:m borough bbl using "$data/raw/error_files/patch_ct2010_missing.dta", nogen keep(1 3 4) update
	
	* Merge in NTAs
	merge m:1 ct2010 borough using "$data/clean/ct2010_to_nta.dta", nogen keep(1 3)
	
	* Merge in Census characteristics
	merge m:1 borough ct2010 cb2010 using "$data/clean/census_area_characteristics.dta", nogen keep(1 3)
			
	* Add latitude-longitude coordinates
	merge 1:1 bbl using "$data/raw/coordinates_latlong.dta", nogen keep(1 3)
	rename xcoord xcoord_spc
	rename ycoord ycoord_spc
	rename latitude ycoord_latlong
	rename longitude xcoord_latlong

	* Drop old buildings (misclassified by permits)
	merge 1:1 bbl using "$data/raw/error_files/old_bbls_to_drop.dta", nogen keep(1)
	
	* Fix data entry errors in PLUTO's residential unit counts
	merge 1:1 bbl using "$data/raw/error_files/pluto_data_errors.dta", update replace nogen keep(1 3 4 5)
	
	* Recode historical district variable
	encode histdist, gen(histdistrict)
	drop histdist
	rename histdistrict histdist_name
	gen histdist = !missing(histdist_name)
	
	* Replace incorrect year built data (misses teardowns)
	replace yearbuilt = yearalter1 if yearbuilt <= 2000 & yearalter1 >= 2000
	replace yearbuilt = yearalter2 if yearbuilt <= 2000 & yearalter2 >= 2000
	
	* Replace missing permit issuance years
	gen yearpermit = year(issuance_date)
	replace yearpermit = min(yearbuilt, 2015) if missing(yearpermit)
	
	* Drop ineligible units
	drop if unitsres < 3
	drop if unitsres < 4 & yearpermit >= 2008
	
	* Identify buildings in non-GEA Manhattan below 110th Street
	gen sub_gea = 0
	replace sub_gea = 1 if borough == 1 & gea_original == 0
	replace sub_gea = 0 if nta_code == "MN11" | nta_code == "MN03" | ///
		nta_code == "MN06" | nta_code == "MN04" | nta_code == "MN36" | ///
		nta_code == "MN35" | nta_code == "MN01"
	replace sub_gea = 0 if nta_code == "MN09" & ct2010 != 19500 & ct2010 != 19300
	replace sub_gea = 0 if nta_code == "MN34" & ct2010 != 180 & cb2010 != 3001 & cb2010 != 3002 & cb2010 != 2001 & cb2010 != 2002

*** 421-a variables

	gen any421a = receipt>0 if !missing(receipt)
	gen inclusionary_offsite = receipt == 1 | receipt == 2 if !missing(receipt)
	gen inclusionary_onsite = receipt == 3 | receipt == 4 | receipt == 7 if !missing(receipt)
	
*** Developer history

	preserve
	
	collapse (sum) unitsres, by(permittee_id inclusionary_onsite)
	reshape wide unitsres, i(permittee_id) j(inclusionary_onsite)
	
	format permittee_id %18.0f
	replace unitsres1 = 0 if missing(unitsres1)
	
	tempfile developer_history
	save `developer_history', replace
	
	restore
	merge m:1 permittee_id using `developer_history', nogen
	
	gen sh_incl_dev_lo = (unitsres1-unitsres*inclusionary_onsite)/(unitsres0+unitsres1-unitsres)
	drop unitsres0 unitsres1
	
	replace permitteesbusinessname = "UNKNOWN" if missing(permitteesbusinessname)
	bys permittee_id: egen permitteesbusinessname_ = mode(permitteesbusinessname)
	labmask permittee_id, values(permitteesbusinessname_)
	drop permitteesbusinessname permitteesbusinessname_

*** Useful tranformations of variables

	gen lassessland = ln(assessland)
	gen llotarea = ln(lotarea)
	gen llotfront = ln(lotfront)
	gen llotdepth = ln(lotdepth)
			
*** Label variables

	label variable block "Block (BBL)"
	label variable borough "Borough (name, encoded w/ BBL)"
	label variable bbl "Borough-Block-Lot (BBL) code"
	label variable unitsres "Residential Units"
	label variable unitstotal "Total Units (incl. commercial)"
	label variable receipt_421a "Receives 421-a benefit (categorical)"
	label variable floodzone "In 2007 Flood Zone"
	label variable gea_expansion "In GEA Expansion Area"
	label variable gea_original "In GEA Original Area"
	label variable gea "In GEA in Year of Permit Issuance"
	local variable sub_gea "In non-GEA Manhattan below 110th Street"
	label variable yearbuilt "Year Built (PLUTO)"
	label variable issuance_date "Date of Permit Issuance (DOB)"
	label variable yearpermit "Year Permitted (DOB)"
	label variable assessland "Assessed Value, Land"
	label variable assesstot "Assessed Value, Total"
	label variable exempttot "Exempt Value, Total"
	label variable nta "Neighborhood Tabulation Area (encoded string)"
	label variable nta_code "Neighborhood Tabulation Area Code (string)"
	label variable address "Building Address (string)"
	label variable zonedist "Zoning District"
	label variable cd "Community District (NYC DCP)"
	label variable ct2010 "2010 Census Tract"
	label variable cbg2010 "2010 Census Block Group"
	label variable cb2010 "2010 Census Block"
	label variable schooldist "School District"
	label variable council "City Council District"
	label variable xcoord_latlong "Longitude"
	label variable ycoord_latlong "Latitude"
	label variable xcoord_spc "Longitude (State Plane Coordinates)"
	label variable ycoord_spc "Latitude (State Plane Coordinates)"
	label variable histdist "In Historical District"
	label variable histdist_name "Historical District (encoded name)"
	label variable builtfar "Floor Area Ratio, Built"
	label variable residfar "Floor Area Ratio, Residential Max"
	label variable yearalter1 "Year Altered (if any, first)"
	label variable yearalter2 "Year Altered (if any, second)"
	label variable zipcode "ZIP Code"
	label variable policeprct "Police Precinct Number"
	label variable proxcode "Proximity (detached/attached)"
	label variable bldgclass_det "Building Class, detailed"
	label variable bldgclass_broad "Building Class, broad"
	label variable numbldgs "Number of Buildings"
	label variable numfloors "Number of Floors"
	label variable lotfront "Lot Frontage (feet)"
	label variable lotdepth "Lot Depth (feet)"
	label variable bldgfront "Building Frontage (feet)"
	label variable bldgdepth "Building Depth (feet)"
	label variable basement "Has Basement"
	label variable spdist "In Special District"
	label variable lottype "Lot Type"
	label variable lotarea "Lot Area (sqft)"
	label variable bldgarea "Floor Area, Total (sqft)"
	label variable resarea "Floor Area, Residential (sqft)"
	label variable condo "Is a Condo Building"
	label variable sh_lths "Share Less Than HS (block group)"
	label variable sh_hsdeg "Share HS Degree (block group)"
	label variable sh_somecoll "Share Some College (block group)"
	label variable sh_coll "Share bachelor's Begree (block group)"
	label variable sh_postgrad "Share More Than Bachelor's (block group)"
	label variable sh_drive "Share Car Commuter (block group)"
	label variable sh_bus "Share Bus Commuter (block group)"
	label variable sh_subway "Share Subway Commuter (block group)"
	label variable sh_walk "Share On-foot Commuter (block group)"
	label variable medage "Median Age (block group)"
	label variable lmedhhinc "Log Median HH Income (block group)"
	label variable lmedgrossrent "Log Median Gross Monthly Rent (block group)"
	label variable lmeangrossrent "Log Mean Gross Monthly Rent (block group)"
	label variable pop "Population (block)"
	label variable ct_occ_renter_units "Number of Occupied Rental Units (block)"
	label variable sh_poor "Share Poor (block group)"
	label variable sh_renter "Share renter (block)"
	label variable sh_hispanic "Share Hispanic (block)"
	label variable sh_white "Share Non-Hispanic White (block)"
	label variable sh_black "Share Black (block)"
	label variable sh_asian "Share Asian (block)"
	label variable sh_vacant "Share Units Vacant (block)"
	label variable inclusionary_onsite "Accepted On-Site Inclusionary Housing"
	label variable inclusionary_offsite "Accepted Off-site Inclusionary Housing"
	label variable dtaxrate_onsite "PDV Difference in Tax Rate from On-Site Inclusionary"
	label variable dtaxrate_offsite "PDV Difference in Tax Rate from Off-Site Inclusionary"
	label variable permittee_id "Permittee identifier"
	label variable lassessland "Log of Assessed Value of Land"
	label variable llotarea "Log of Lot Area (sqft)"
	label variable lotfront "Log of Lot Frontage (feet)"
	label variable lotdepth "Log of Lot Depth (feet)"
	label variable lpopdens "Log of population density (per sq.mi.)"

*** Save data

	save "$data/clean/pluto_merged.dta", replace
